The objective of this markdown is to give an overview of the big
dataset COVID19_ALL.h5ad as well as the smaller “pilot” we
have created. For detail on how the smaller dataset was created see
Clean_data_subsetting.Rmd.
The data objects are AnnData objects. If you are confused about the structure see this.
library(tidyverse)
library(anndata)
setwd("~/sc_covid_PiB2023/data_science_project/Code") # changing the working directory
pilot <- read_h5ad('../Data/Pilot_2_rule_them_ALL.h5ad')
# Loading dthe large dataset
All_data <- read_h5ad("../../data/sc_covid/COVID19_ALL.h5ad")
# be patient it takes about 15 min to load with 128 G of RAM available
We’ll save the annotations in a dataframe to make it easier to work with ggplot:
pilot_obs <- data.frame(pilot$obs)
All_data_obs <- data.frame(All_data$obs)
This is just to give an overview of the size of the expression matrix for both. It’s \(cell \times genes\):
c(All_data$n_obs, All_data$n_vars)
## [1] 1462702 27943
1,462,702 cells and 27,943 genes
dim(pilot$X)
## [1] 5694 23382
5,694 cells and 23,382 genes.
Overview of the annotations in both
colnames(All_data_obs)
## [1] "celltype"
## [2] "majorType"
## [3] "sampleID"
## [4] "PatientID"
## [5] "datasets"
## [6] "City"
## [7] "Age"
## [8] "Sex"
## [9] "Sample.type"
## [10] "CoVID.19.severity"
## [11] "Sample.time"
## [12] "Sampling.day..Days.after.symptom.onset."
## [13] "SARS.CoV.2"
## [14] "Single.cell.sequencing.platform"
## [15] "BCR.single.cell.sequencing"
## [16] "TCR.single.cell.sequencing"
## [17] "Outcome"
## [18] "Comorbidities"
## [19] "COVID.19.related.medication.and.anti.microbials"
## [20] "Leukocytes..G.L."
## [21] "Neutrophils..G.L."
## [22] "Lymphocytes..G.L."
## [23] "Unpublished"
colnames(pilot_obs)
## [1] "cellType" "viralLoad" "PatientID"
## [4] "sampleID" "Is_infected" "Age"
## [7] "Sex" "CoVID.19.severity"
There are a lot of different celltypes in All_data
length(unique(All_data$obs$celltype))
## [1] 64
So it will not yeild a pretty plot. But we’ll try anyway:
ggplot(All_data_obs, aes(celltype, fill = celltype))+
geom_bar(stat = "count" ) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size = 6),
legend.text = element_text(size = 5), # Adjust the size parameter for legend text
legend.title = element_text(size = 7), # Adjust the size parameter for the legend title
legend.key.size = unit(0.4, "cm"))
The annotation majorType is much closer to
cellType in the pilot.
ggplot(All_data_obs, aes(majorType, fill = majorType))+
geom_bar(stat = "count" ) +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5, size = 3) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
ggplot(pilot_obs, aes(cellType, fill = cellType))+
geom_bar(stat = "count" ) +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = +1.5) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
We’ll also have to make a different dataframe such that everything related to the patient does not repeat over multiple rows
# for All_data:
All_data_df <- All_data_obs %>%
select(c("PatientID","City","Age","Sex","CoVID.19.severity")) %>%
unique()
# for the pilot:
pilot_df <- pilot_obs %>%
select(c("PatientID", "Age", "Sex", "CoVID.19.severity")) %>%
unique()
They made a mistake when creating the large dataset:
All_data_df[All_data_df$PatientID=="P-M026",]
## PatientID City Age Sex CoVID.19.severity
## AAACCTGAGGATATAC-136 P-M026 Chongqing 32 F mild/moderate
## AAACCTGAGGTGCACA-140 P-M026 Chongqing 62 F mild/moderate
Somehow this woman with patient id P-M026 is simultaniously 32 and 62 years old
ggplot(All_data_df, aes(Sex, fill = Sex))+
geom_bar(stat = "count" ) +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5)
ggplot(pilot_df, aes(Sex, fill = Sex))+
geom_bar(stat = "count" ) +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = +2)
All_ages <- as.numeric(as.character(All_data_df$Age[All_data_df$Age != "unknown"]))
summary(All_ages)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.00 35.00 49.00 49.22 62.00 92.00
as.data.frame(All_ages) %>%
ggplot(aes(All_ages))+
xlab("Age")+
geom_bar()
pilot_ages <- as.numeric(as.character(pilot_df$Age))
summary(pilot_ages)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 46.00 56.50 61.50 60.25 63.75 72.00
as.data.frame(pilot_ages) %>%
ggplot(aes(pilot_ages))+
geom_bar(stat = "count" ) +
xlab("Age")
ggplot(All_data_df, aes(CoVID.19.severity, fill = CoVID.19.severity))+
geom_bar(stat = "count" ) +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = 2)
unique(pilot_df$CoVID.19.severity)
## [1] severe/critical
## Levels: severe/critical
We have some potentially interesting genes from
Clean_PCA.Rmd that were responsible for most of the
varience; MALAT1, B2M, MT-CO2, FTL, MT-CO1.
Furthermore it might be fun to look at the highest expressed genes (on average) as well as the genes that has the largest variance (we only do this for the pilot since the large dataset would take wayyyyy too long). We first find the most expressed genes and the genes with the most variance:
AvExpression <- apply(pilot$X, 2, mean) # Compute average gene expression across cell types
varExpression <- apply(pilot$X, 2, var) # Compute variance of expression of genes
M = 3 # number of most expressed cells we want
Av_genes <- names(AvExpression[order(AvExpression, decreasing = T)][1:M]) # Find M most expressed genes
var_genes <- names(varExpression[order(varExpression, decreasing = T)][1:M]) # Find M most expressed genes(or varience)
# Extract the expression matrix for the given genes as a dense matrix
matrix_av <- as.data.frame(as.matrix(pilot[, (Av_genes)]$X)) %>% cbind(pilot$obs)
matrix_var <- as.data.frame(as.matrix(pilot[, (var_genes)]$X)) %>% cbind(pilot$obs)
Av_genes
## [1] "MALAT1" "B2M" "MT-CO2"
Boxplot showing the expression of a specified gene for all cells of each cell type (colored by viral load).
# "MALAT1"
matrix_av %>%
pivot_longer(cols = "MALAT1") %>%
ggplot(aes(x = cellType, y = value, col = viralLoad))+
scale_color_gradient(low='blue', high = 'red')+
geom_boxplot() +
geom_jitter(alpha = 0.35, size = 0.4) +
facet_wrap(~name)+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
legend.position = "right") +
labs(y = "Expression Count (Normalized)", x = "Cell Type")
# "B2M"
matrix_av %>%
pivot_longer(cols = "B2M") %>%
ggplot(aes(x = cellType, y = value, col = viralLoad))+
scale_color_gradient(low='blue', high = 'red')+
geom_boxplot() +
geom_jitter(alpha = 0.35, size = 0.4) +
facet_wrap(~name)+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
legend.position = "right") +
labs(y = "Expression Count (Normalized)", x = "Cell Type")
# "MT-CO2"
matrix_av %>%
pivot_longer(cols = "MT-CO2") %>%
ggplot(aes(x = cellType, y = value, col = viralLoad))+
scale_color_gradient(low='blue', high = 'red')+
geom_boxplot() +
geom_jitter(alpha = 0.35, size = 0.4) +
facet_wrap(~name)+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
legend.position = "right") +
labs(y = "Expression Count (Normalized)", x = "Cell Type")
The same is done for the genes that have the most variance:
var_genes
## [1] "CCL2" "SLPI" "S100A8"
# "CCL2"
matrix_var %>%
pivot_longer(cols = "CCL2") %>%
ggplot(aes(x = cellType, y = value, col = viralLoad))+
scale_color_gradient(low='blue', high = 'red')+
geom_boxplot() +
geom_jitter(alpha = 0.35, size = 0.4) +
facet_wrap(~name)+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
legend.position = "right") +
labs(y = "Expression Count (Normalized)", x = "Cell Type")
# "SLPI"
matrix_var %>%
pivot_longer(cols = "SLPI") %>%
ggplot(aes(x = cellType, y = value, col = viralLoad))+
scale_color_gradient(low='blue', high = 'red')+
geom_boxplot() +
geom_jitter(alpha = 0.35, size = 0.4) +
facet_wrap(~name)+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
legend.position = "right") +
labs(y = "Expression Count (Normalized)", x = "Cell Type")
# "S100A8"
matrix_var %>%
pivot_longer(cols = "S100A8") %>%
ggplot(aes(x = cellType, y = value, col = viralLoad))+
scale_color_gradient(low='blue', high = 'red')+
geom_boxplot() +
geom_jitter(alpha = 0.35, size = 0.4) +
facet_wrap(~name)+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
legend.position = "right") +
labs(y = "Expression Count (Normalized)", x = "Cell Type")
A heatmap showing the expression of the \(M\) most expressed genes (on average) for \(M\) randomly sampled cells:
M = 10 # amount of genes and cells
AvExpression <- apply(pilot$X, 2, mean)
MostExpressed <- names(AvExpression[order(AvExpression, decreasing = T)][1:M]) # Selecting the most expressed genes
genes <- MostExpressed # Insert genes, you wish to heat map
samples <- sample(1:5694, M)
heatMapData <- as.data.frame(as.matrix(pilot[(samples), (genes)]$X)) # M x genes data frame is selected
heatMapData %>%
rownames_to_column() %>%
gather(colname, value, -rowname) %>%
ggplot(aes(y = rowname, x = colname, fill = value)) +
geom_tile() +
scale_fill_gradient(low = "yellow", high = "purple") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
labs(title = "Heatmap of Selected Gene Expression Values", subtitle = "For Random Sample") +
xlab("Genes") +
ylab("Sample ID")